import solution
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import warnings
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show
# Notebook setup and data load.
# IPython magic: render matplotlib figures inline, below the cell that creates them.
# NOTE(review): the two `%` lines are IPython magics, not Python syntax; this file
# only runs inside Jupyter/IPython.
%matplotlib inline
# IPython magic: request 'retina' (high-resolution) output from the inline backend.
%config InlineBackend.figure_format = 'retina'
# Read the csv file (path relative to the working directory) into a pandas dataframe.
golf = pd.read_csv("golf.csv")
# Print column names, dtypes and non-null counts.
golf.info()
# First and last five rows.
# NOTE(review): these are bare expressions; presumably each lived in its own
# notebook cell, since only a cell's final expression is displayed.
golf.head(5)
golf.tail(5)
# Give the columns a consistent Title Case naming scheme.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html
# NOTE(review): the key 'biride or better % from the rough' looks like a typo for
# 'birdie'. rename() silently skips keys that match no column, so confirm this key
# against the actual CSV header (a later cell drops the lowercase 'birdie ...' name).
column_titles = {
    'FEDEX CUP POINTS': "FedEx Cup Points",
    'AVERAGE DRIVING DISTANCE': 'Average Driving Distance',
    'Strokes gained tee to green': 'Strokes Gained Tee to Green',
    'Strokes gained approach to the green': 'Strokes Gained Approach to the Green',
    'Strokes gained around the green': 'Strokes Gained Around the Green',
    'Strokes gained putting': 'Strokes Gained Putting',
    'Strokes gained total': 'Strokes Gained Total',
    'BIRDIE TO BOGEY RATIO': 'Birdie to Bogey Ratio',
    'biride or better % from the rough': 'Birdie or Better % From the Rough',
    'Scrambling average distance to the hole': 'Scrambling Average Distance to the Hole',
}
golf = golf.rename(columns=column_titles)
# Remove the blank 'Unnamed: N' columns in a single drop call.
# http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.drop.html
blank_columns = [
    'Unnamed: 16',
    'Unnamed: 17',
    'Unnamed: 18',
    'Unnamed: 19',
    'Unnamed: 20',
]
golf = golf.drop(blank_columns, axis=1)
# Quick look at the first two rows after the clean-up.
golf.head(2)
# Start with key summary statistics for the numeric columns.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html
golf.describe()
# Silence all warnings from here on: a font-style warning was cluttering the
# plot output. Added only once the rest of the notebook was complete.
# Thanks to Joe Kambourakis for help with filterwarnings.
# https://docs.python.org/3.1/library/warnings.html
warnings.filterwarnings("ignore")
# Pairwise scatter plots (with per-variable distributions on the diagonal).
# http://seaborn.pydata.org/generated/seaborn.pairplot.html
sns.pairplot(golf)
# Correlation matrix drawn as a heatmap.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html
# http://seaborn.pydata.org/generated/seaborn.heatmap.html
# The frame still holds the 'PLAYER NAME' column (presumably text). Older pandas
# silently skipped non-numeric columns in corr(); pandas >= 2.0 raises instead.
# Selecting the numeric columns explicitly behaves the same on both.
golf_corr = golf.select_dtypes(include=[np.number]).corr()
sns.heatmap(golf_corr)
# Drop the columns that are not carried into the rest of the analysis,
# leaving Official Money Won, the strokes-gained measures and the player name.
redundant_columns = [
    'FedEx Cup Points',
    'Average Driving Distance',
    'Strokes Gained Total',
    'Birdie to Bogey Ratio',
    'Scrambling Average Distance to the Hole',
    'World Ranking',
    'Scoring Average',
    'Percent of Available Purse Won',
]
golf = golf.drop(redundant_columns, axis=1)
# The birdie-from-the-rough column can be present under either spelling: the
# rename step maps a key spelled 'biride ...' to the Title Case name, so whether
# the rename took effect depends on the CSV header. errors='ignore' drops
# whichever spelling is present instead of raising KeyError on the other.
rough_birdie_names = [
    'birdie or better % from the rough',
    'Birdie or Better % From the Rough',
]
golf = golf.drop(rough_birdie_names, axis=1, errors='ignore')
# Pair plots of the remaining variables with fitted regression lines.
sns.pairplot(golf, kind='reg')
# Correlation matrix, drawn as a lower-triangle heatmap.
# Erik Ellis (from data science bootcamp class) helped me get started on the
# concept of this layout; the rest came from the documentation at:
# http://seaborn.pydata.org/generated/seaborn.heatmap.html
# Restrict to numeric columns first: the frame still holds 'PLAYER NAME'
# (presumably text), and pandas >= 2.0 raises on non-numeric columns in corr().
golf_corr = golf.select_dtypes(include=[np.number]).corr()
# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of variables is shown only once.
mask = np.zeros_like(golf_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Use the "white" axes style for this plot only.
with sns.axes_style("white"):
    ax = sns.heatmap(golf_corr, mask=mask, square=True, annot=True, linewidths=0.5)
# Create a new DataFrame without the approach, around-the-green and
# tee-to-green measures (the original intent: just Money Won, Driving, Putting).
# drop() returns a new frame, so `golf` itself is left untouched.
golf_chart = golf.drop(
    [
        'Strokes Gained Approach to the Green',
        'Strokes Gained Around the Green',
        'Strokes Gained Tee to Green',
    ],
    axis=1,
)
# Correlation data for the reduced frame. Numeric columns are selected
# explicitly because 'PLAYER NAME' (presumably text) is still present and
# pandas >= 2.0 raises on non-numeric columns in corr().
golf_chart_corr = golf_chart.select_dtypes(include=[np.number]).corr()
# Output a simple horizontal bar chart of each variable's correlation with
# Official Money Won, smallest first, with a catchy title.
# http://pandas.pydata.org/pandas-docs/stable/visualization.html
ax = golf_chart_corr['Official Money Won'].sort_values(ascending=True).plot.barh()
ax.set_xlabel('Correlation Coefficients', fontsize=12)
ax.set_title('"Drive for Show, Putt for Dough" is a Myth', fontsize=18)
plt.show()
# Same bar chart as above, but built from the earlier golf_corr frame, which
# still includes every Strokes Gained variable.
money_corr = golf_corr['Official Money Won']
ranked_corr = money_corr.sort_values(ascending=True)
ax = ranked_corr.plot.barh()
ax.set_title('Strokes Gained Tee to Green = "Show Me the Money" ', fontsize=18)
ax.set_xlabel('Correlation Coefficients', fontsize=12)
plt.show()
# Build a reduced frame for the pair plot by leaving out the approach and
# around-the-green measures (keeps Money Won, Driving, Putting, Tee to Green).
columns_to_hide = [
    'Strokes Gained Approach to the Green',
    'Strokes Gained Around the Green',
]
golf_pairplot = golf.drop(columns_to_hide, axis=1)
# Switch seaborn to the "ticks" style before drawing the regression pair plots.
sns.set(style="ticks", color_codes=True)
sns.pairplot(golf_pairplot, kind='reg')
# Interactive bokeh scatter plot: box zoom lets us zero in on portions of the data.
# figure, output_notebook and show are imported at the top of the notebook.
# http://bokeh.pydata.org/en/latest/docs/user_guide.html
# http://bokeh.pydata.org/en/latest/docs/user_guide/plotting.html
x_column = 'Strokes Gained Tee to Green'
y_column = 'Official Money Won'
strokes = golf[x_column]  # x-values
money = golf[y_column]  # y-values
# Set up the figure; the axis labels reuse the column names.
# NOTE(review): newer Bokeh releases spell these arguments width/height --
# confirm against the installed version.
p = figure(
    plot_width=600,
    plot_height=400,
    x_axis_label=x_column,
    y_axis_label=y_column,
    title="Strokes Gained Tee to Green versus Official Money Won",
)
p.circle(strokes, money)
# Route bokeh output to the notebook, then display the plot.
output_notebook()
show(p)
def _dollars_to_thousands(dollars):
    """Return *dollars* in whole thousands, truncating the fractional part."""
    return int(dollars / 1000)


def _round_to_three_places(value):
    """Return *value* rounded to three decimal places."""
    return round(value, 3)


# Convert Official Money Won to $ thousands and round the tee-to-green figure
# so the hover labels display more cleanly.
# NOTE: both columns are overwritten in place, so re-running this cell divides
# the money column by 1000 again.
golf['Official Money Won'] = golf["Official Money Won"].map(_dollars_to_thousands)
golf['Strokes Gained Tee to Green'] = (
    golf['Strokes Gained Tee to Green'].map(_round_to_three_places)
)
golf.head(2)
# Let's create a hover tool! Very cool!
# figure, ColumnDataSource, HoverTool, output_notebook and show are imported
# at the top of the notebook.
# http://bokeh.pydata.org/en/latest/docs/user_guide/tools.html#hover-tool
# Inspiration for this chart came from data science bootcamp.
def show_hover_scatter(frame, x_column, title, width=550, height=350,
                       marker_size=3, x_bounds=None):
    """Display a bokeh scatter of a strokes-gained column against money won.

    Hovering over a point shows the player's name and the (strokes, money) pair.

    Parameters:
        frame: DataFrame providing `x_column`, 'Official Money Won' and
            'PLAYER NAME'.
        x_column: name of the column plotted on the x-axis.
        title: plot title.
        width, height: plot size passed to bokeh's figure().
        marker_size: size of the circle markers.
        x_bounds: optional (low, high) tuple assigned to the x-axis bounds;
            left untouched when None.

    Returns:
        The bokeh figure that was shown.
    """
    # The dict keys are the names the tooltips and glyph refer to.
    source = ColumnDataSource(
        data=dict(
            strokes=frame[x_column],
            money=frame['Official Money Won'],
            name=frame['PLAYER NAME'],
        )
    )
    hover = HoverTool(
        tooltips=[
            ("name", "@name"),
            ("(strokes,money)", "@strokes, @money"),
        ]
    )
    # Passing tools=[hover] makes the hover tool the only tool on the plot.
    plot = figure(plot_width=width, plot_height=height, tools=[hover], title=title)
    plot.xaxis.axis_label = "Strokes Gained"
    plot.yaxis.axis_label = "Money Won (In $Thousand)"
    if x_bounds is not None:
        plot.xaxis.bounds = x_bounds
    plot.circle('strokes', 'money', size=marker_size, source=source)
    output_notebook()
    show(plot)
    return plot


# Tee to green: the larger chart, with the x-axis bounded to (-3, 3).
p = show_hover_scatter(
    golf,
    'Strokes Gained Tee to Green',
    "Strokes Gained Tee to Green versus Official Money Won",
    width=600,
    height=400,
    marker_size=4,
    x_bounds=(-3, 3),
)
# Off the tee (driving).
p = show_hover_scatter(
    golf,
    'Strokes Gained Off the Tee',
    "Strokes Gained Off the Tee versus Money Won",
)
# Putting.
p = show_hover_scatter(
    golf,
    'Strokes Gained Putting',
    "Strokes Gained Putting versus Money Won",
)
# Add the natural log of Official Money Won as a new column.
# https://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.log.html
# NOTE(review): the money column was truncated to whole $ thousands earlier, so a
# value of 0 would give -inf here -- confirm no player falls below $1,000.
money_in_thousands = golf["Official Money Won"]
golf['Log of Official Money Won'] = money_in_thousands.apply(np.log)
golf.head(2)
# Same lower-triangle correlation heatmap as earlier, now including the log column.
# Numeric columns are selected explicitly because 'PLAYER NAME' (presumably text)
# is still in the frame and pandas >= 2.0 raises on non-numeric columns in corr().
golf_corr = golf.select_dtypes(include=[np.number]).corr()
# Boolean mask hiding the upper triangle (diagonal included).
mask = np.zeros_like(golf_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(golf_corr, mask=mask, square=True, annot=True, linewidths=0.5)
# Regression pair plots across all remaining variables.
sns.pairplot(golf, kind='reg')
# Scatter plot with a fitted regression line: tee-to-green versus log money won.
# http://seaborn.pydata.org/generated/seaborn.lmplot.html
sns.lmplot(x='Strokes Gained Tee to Green', y='Log of Official Money Won', data=golf)
plt.show()
# This is really interesting because lmplot lets us draw the scatter plot together with a fitted regression line.
# It is intended as a convenient interface to fit regression models across conditional subsets of a dataset.
# We are seeing a linear relationship now that we use the log of Official Money Won.